
#  Open a logr log for this run; it is closed by log_close() on the last
#  line of the script.
library(logr)
log_open()

#  Change this to match your file system
#  NOTE(review): every relative path used below ("replication_file/...",
#  "current_draft/...") is resolved against this working directory.
setwd("C:/Users/tuq69844/Dropbox/anti hate treatments")

#  Packages
library(tidyverse)
library(readxl)
library(lubridate)

#  Script options: turn time-intensive parts of the code on/off
#    code_URLs_from_scratch:      re-run the regex coding of every site visit
#                                 (Step 2) instead of loading the pre-coded files
#    create_summary_from_scratch: rebuild the day x person data (Part 1) from
#                                 the raw browsing data; must be TRUE if coding
#                                 URLs from scratch
code_URLs_from_scratch      = FALSE
create_summary_from_scratch = FALSE

#  Coding URLs needs the raw browsing data, which is only loaded when
#  create_summary_from_scratch is TRUE. Stop right away with a clear message
#  instead of failing later in Step 2 with an "object not found" error.
if(code_URLs_from_scratch && !create_summary_from_scratch){
  stop("This won't work. Change create_summary_from_scratch to TRUE.", call. = FALSE)
}

#  Time at which each respondent began the survey
#  (file created by analysis_survey.R based on raw data)
d_survey = read_csv(file = "replication_file/data_survey_time.csv")

#  List of HA websites: second sheet of the workbook, first row = column names
d_url = read_xlsx(
  path      = "replication_file/list_of_sites.xlsx",
  sheet     = 2,
  col_names = TRUE
)

#############################
###   PART 1 (optional):  ###
###   Create daily data   ###
#############################

#--------------------------------
#  Step 1: load & subset data.

if(create_summary_from_scratch){

  #  Read each raw browsing file and keep only the rows whose caseid
  #  appears in d_survey (i.e. subjects who took wave 1)

  d_web_pre = read_csv(
    "replication_file/excluded_from_public_file/data_web_pre_raw.csv",
    guess_max = 20000
  ) %>%
    filter(caseid %in% d_survey$caseid)

  d_web_post = read_csv(
    "replication_file/excluded_from_public_file/data_web_post_raw.csv",
    guess_max = 20000
  ) %>%
    filter(caseid %in% d_survey$caseid)
}
  
#--------------------------------------------
#  Step 2: code site visits as HA or not.

if(code_URLs_from_scratch){

  #  Strip http(s) prefixes, www, and common end junk (trailing "/?",
  #  ":<digit>", ":<a-e>", final "/") from the listed URLs

  d_url = mutate(
    d_url,
    url_clean = gsub("^http:..|^https:..|www.|/\\?$|:[0-9]$|:[a-e]$|/$", "", URL)
  )

  #  Build one "a|b|c" pattern from the cleaned URLs of the selected rows
  #  plus the non-missing usernames of those rows. `rows` is a logical
  #  vector over d_url; NA entries are treated as "not selected".
  #  NOTE(review): the URLs are pasted in unescaped, so characters such as
  #  "." or "?" act as regex operators in the grepl() calls below --
  #  confirm this is intended before re-coding from scratch.

  build_regex = function(rows){
    paste(
      c(
        d_url$url_clean[which(rows)],
        d_url$username[which(rows & !is.na(d_url$username))]
      ),
      collapse = "|"
    )
  }

  regex_antisem = build_regex(d_url$ANTISEMITIC == "antisemitic")
  regex_other   = build_regex(is.na(d_url$ANTISEMITIC))

  #  Flag each visit as antisemitic / other HA (1 or 0), compute the time
  #  spent on flagged pages, then zero out the known false positive.

  code_visits = function(d){
    d = d %>%
      mutate(
        url_antisem  = as.numeric(grepl(regex_antisem, page_url_anonymized)),
        url_other    = as.numeric(grepl(regex_other, page_url_anonymized)),
        time_antisem = url_antisem * page_duration,
        time_other   = url_other * page_duration
      )

    #  Fix false positives (assigning to an empty index is a no-op)
    false_pos = grep("Fuckthealtright", d$page_url_anonymized)
    d$url_antisem[false_pos]  = 0
    d$time_antisem[false_pos] = 0

    d
  }

  d_web_pre  = code_visits(d_web_pre)
  d_web_post = code_visits(d_web_post)

  #  Export precoded data.
  #  Only select variables are exported here to keep file size down.
  #  When code_URLs_from_scratch is FALSE, these files are read back in
  #  and their code columns are appended to the raw browsing data.

  d_web_pre %>%
    select(caseid, page_url_anonymized, url_antisem, url_other, time_antisem, time_other) %>%
    write_csv("replication_file/excluded_from_public_file/data_web_pre_precoded.csv")

  d_web_post %>%
    select(caseid, page_url_anonymized, url_antisem, url_other, time_antisem, time_other) %>%
    write_csv("replication_file/excluded_from_public_file/data_web_post_precoded.csv")

}

#  If not coding from scratch, load the pre-coded data and add the codes.
#  The codes are attached by row position: the pre-coded files are written
#  in Step 2 from the filtered d_web_pre / d_web_post, so they are expected
#  to have one row per row of the filtered raw data, in the same order.

if(!code_URLs_from_scratch && create_summary_from_scratch){
  d_url_pre  = read_csv("replication_file/excluded_from_public_file/data_web_pre_precoded.csv")
  d_url_post = read_csv("replication_file/excluded_from_public_file/data_web_post_precoded.csv")

  #  A positional bind is only meaningful if the files still line up;
  #  fail loudly rather than attach codes to the wrong visits.
  stopifnot(
    nrow(d_url_pre)  == nrow(d_web_pre),
    nrow(d_url_post) == nrow(d_web_post)
  )

  #  Select the code columns by name (they are columns 3-6 of the files
  #  written in Step 2) so a reordered file cannot silently shift them.
  code_cols = c("url_antisem", "url_other", "time_antisem", "time_other")

  d_web_pre  = cbind(d_web_pre,  d_url_pre[, code_cols])
  d_web_post = cbind(d_web_post, d_url_post[, code_cols])
}

#----------------------------------------
#  Step 3. Create day x person data

if(create_summary_from_scratch){

  #  Survey side: calendar date on which each respondent began wave 1
  #  (everything from the first space onward is dropped before parsing)

  d_survey = mutate(
    d_survey,
    start_date = as.Date(gsub(" .+", "", starttime_w1))
  )

  #  Stack the pre and post browsing data, attach the survey data, and
  #  derive the visit-level variables that depend on the survey data.
  #  left_join() is called without `by`, so it joins on every column name
  #  the two tables share.

  d_web = bind_rows(d_web_pre, d_web_post) %>%
    left_join(d_survey) %>%
    mutate(

      #  calendar date of the site visit and of the survey start
      web_date   = as.Date(gsub(" .+", "", start_time_utc)),
      start_date = as.Date(gsub(" .+", "", starttime_w1)),

      #  numeric versions of both timestamps
      #  NOTE(review): treated as seconds below, which presumes both columns
      #  were parsed as date-times -- confirm
      starttime_url     = start_time_utc,
      starttime_url_sec = as.numeric(starttime_url),
      starttime_w1_sec  = as.numeric(starttime_w1),

      #  TRUE when the visit happened at or before the start of the survey
      pre_treatment = starttime_w1_sec >= starttime_url_sec,

      #  number of (completed) 24-hour intervals since the survey start;
      #  negative for visits before it
      days_since_began_survey = (starttime_url_sec - starttime_w1_sec) / (60*60*24),
      days_since              = floor(days_since_began_survey)
    )

  #  One row per respondent x day x pre/post flag, for all outcomes

  d_web_daily = d_web %>%
    group_by(caseid, days_since, pre_treatment) %>%
    summarise(
      visits_total      = n(),
      visits_HA_antisem = sum(url_antisem),
      visits_HA_other   = sum(url_other),
      visits_HA_total   = visits_HA_antisem + visits_HA_other,
      time_HA_antisem   = sum(time_antisem),
      time_HA_other     = sum(time_other),
      time_HA_total     = time_HA_antisem + time_HA_other,
      number_of_days    = length(unique(days_since))
    ) %>%
    mutate(
      #  week 0 = pre-treatment, weeks 1-5 = successive 7-day windows,
      #  week 6 = everything else
      weeks_since = case_when(
        days_since < 0 | pre_treatment ~ 0,
        days_since < 7  ~ 1,
        days_since < 14 ~ 2,
        days_since < 21 ~ 3,
        days_since < 28 ~ 4,
        days_since < 35 ~ 5,
        TRUE ~ 6
      )
    )

  #  Export & close the if statement

  write_csv(d_web_daily, "replication_file/data_web_daily_precoded.csv")

}

####################################################
###  PART 2: Based on daily data, create the     ###
###          weekly and pre/post summaries.      ###
####################################################

#  Daily data: read the day x person file and add "any" variables
#  (1 if the corresponding visit count is non-zero, 0 otherwise)

d_web_daily = read_csv("replication_file/data_web_daily_precoded.csv") %>%
  mutate(
    any_HA_antisem = as.numeric(visits_HA_antisem != 0),
    any_HA_other   = as.numeric(visits_HA_other != 0),
    any_HA_total   = as.numeric(visits_HA_total != 0)
  )

#  Weekly data: sum the daily visit and time columns within
#  respondent x pre/post x week, then rebuild the "any" variables from the
#  weekly totals. (Using mean() instead of sum() would make all the
#  variables PER DAY.)

d_web_weekly = d_web_daily %>%
  group_by(caseid, pre_treatment, weeks_since) %>%
  summarise_at(vars(visits_HA_antisem:time_HA_total), sum) %>%
  mutate(
    any_HA_antisem = as.numeric(visits_HA_antisem != 0),
    any_HA_other   = as.numeric(visits_HA_other != 0),
    any_HA_total   = as.numeric(visits_HA_total != 0)
  )

#  Pre/post data: same aggregation as the weekly data, grouped only by
#  respondent x pre/post. (Using mean() instead of sum() would make all the
#  variables PER DAY.) The final join adds number_of_days = the number of
#  daily rows observed for each respondent x pre/post cell.

d_web_prepost = d_web_daily %>%
  group_by(caseid, pre_treatment) %>%
  summarise_at(vars(visits_HA_antisem:time_HA_total), sum) %>%
  mutate(
    any_HA_antisem = as.numeric(visits_HA_antisem != 0),
    any_HA_other   = as.numeric(visits_HA_other != 0),
    any_HA_total   = as.numeric(visits_HA_total != 0)
  ) %>%
  left_join(
    d_web_daily %>%
      group_by(caseid, pre_treatment) %>%
      summarise(number_of_days = n())
  )

#  Add logged variables to all datasets

#  add_logs(x): returns x with one extra column per variable in the range
#  visits_HA_antisem:time_HA_total, holding log(value + 1) and named
#  "<variable>_log". The new columns are attached with cbind().
add_logs = function(x){

  logged = x %>%
    ungroup() %>%                                   #  so select() keeps only the range below
    select(visits_HA_antisem:time_HA_total) %>%
    mutate_all(function(v) log(v + 1)) %>%
    rename_all(function(nm) paste0(nm, "_log"))

  cbind(x, logged)
}

d_web_daily   = add_logs(d_web_daily)
d_web_weekly  = add_logs(d_web_weekly)
d_web_prepost = add_logs(d_web_prepost)

#  Add pre-treatment totals to all datasets

#  add_pretreat(x, prepost): left-joins onto x, by caseid, the pre-treatment
#  rows of the pre/post summary. Every joined column (the range
#  visits_HA_antisem:time_HA_total_log) gets the suffix "_pre".
#    x        data with a caseid column
#    prepost  pre/post summary to take the pre-treatment rows from; defaults
#             to the global d_web_prepost as it stands when the function is
#             called, so existing one-argument calls behave as before
add_pretreat = function(x, prepost = d_web_prepost){

  pre_totals = prepost %>%
    ungroup() %>%
    filter(pre_treatment == TRUE) %>%
    select(caseid, visits_HA_antisem:time_HA_total_log) %>%
    rename_at(vars(-caseid), function(nm) paste0(nm, "_pre"))

  #  caseid is the only column the two tables share once the suffix is
  #  added; naming it makes the key explicit and silences the join message
  left_join(x, pre_totals, by = "caseid")
}

d_web_daily   = add_pretreat(d_web_daily)
d_web_weekly  = add_pretreat(d_web_weekly)
d_web_prepost = add_pretreat(d_web_prepost)

#  Export ---------------

#  Write each cleaned dataset to its own CSV (datasets and paths are paired
#  by position)
walk2(
  list(d_web_daily, d_web_weekly, d_web_prepost),
  c(
    "replication_file/data_web_daily_clean.csv",
    "replication_file/data_web_weekly_clean.csv",
    "replication_file/data_web_prepost_clean.csv"
  ),
  write_csv
)

#  Appendix 3.9: list of sites ------------

#  One row per listed URL, sorted by site name:
#    Name    = NAME with everything from the first " (" onward removed
#    Content = "Antisem" when ANTISEMITIC is non-missing, otherwise "Other"
#    Type    = URL_TYPE
out = d_url %>%
  transmute(
    Name    = gsub(" \\(.+", "", NAME),
    Content = ifelse(is.na(ANTISEMITIC), "Other", "Antisem"),
    Type    = URL_TYPE,
    URL
  ) %>%
  arrange(Name)

#  Blank out repeated labels so each site name is printed only once: a row
#  whose Name equals the Name of the row above loses its Name, and also its
#  Content when that matches the row above as well. All comparisons use the
#  original (unblanked) values. Vectorized so that a table with fewer than
#  two rows, or with a missing Name, no longer breaks a row-by-row if().
n_out = nrow(out)
if(n_out > 1){
  same_name    = c(FALSE, out$Name[-1] == out$Name[-n_out])
  same_content = same_name & c(FALSE, out$Content[-1] == out$Content[-n_out])

  #  which() drops NA comparisons, i.e. a missing label is never blanked
  out$Content[which(same_content)] = ""
  out$Name[which(same_name)]       = ""
}

#  Insert a space after selected substrings of the URLs
#  NOTE(review): presumably so that long URLs can wrap inside the table --
#  confirm
out$URL = out$URL %>%
  gsub("list=", "list= ", .) %>%
  gsub("feed/", "feed/ ", .) %>%
  gsub("aHR0cHM6Ly93d3cub21ueWNvbnRlbnQuY29tL2QvcGxheWxpc3QvNWUyN", "aHR0cHM6Ly93d3cub21ueWNvbnRlbnQuY29tL2QvcGxheWxpc3QvNWUyN ", .) %>%
  gsub("aHR0cHM6Ly9tZWRpYS56ZW5jYXN0LmZtL2tpbmdtYWtlci1wb2RjYXN0", "aHR0cHM6Ly9tZWRpYS56ZW5jYXN0LmZtL2tpbmdtYWtlci1wb2RjYXN0 ", .) %>%
  gsub("aHR0cHM6Ly93d3cudGhlcG9saXRpY2FsY2Vzc3Bvb2wub3JnL2ZlZWQvcG9", "aHR0cHM6Ly93d3cudGhlcG9saXRpY2FsY2Vzc3Bvb2wub3JnL2ZlZWQvcG9  ", .)

#  Tag every non-blank Name with a placeholder that is swapped for LaTeX
#  spacing once the table has been rendered
out$Name = ifelse(out$Name == "", "", paste("ADDBREAK", out$Name))

#  Render the table body with xtable (no row names, no header, no rules),
#  replace the placeholder, and save the result
out = out %>%
  xtable::xtable() %>%
  print(
    include.rownames = FALSE,
    include.colnames = FALSE,
    only.contents    = TRUE,
    hline.after      = NULL
  ) %>%
  gsub("ADDBREAK", "\\\\\\\\[-1.5ex]", .)

write(out, "current_draft/appendix_figures/tab_URL.txt")

#  Appendix 3.9: top antisem sites ---------------

#  Requires d_web, which is only created if create_summary_from_scratch = TRUE

if(!create_summary_from_scratch){
  warning("To generate Table SI-60, first re-run the script with create_summary_from_scratch = TRUE.")
} else {
  #  Share of antisemitic visits (with positive time spent) by domain.
  #  "Excluding 4chan" recomputes the shares without the first row of the
  #  sorted table and is NA for that row; domains below 1% on that measure
  #  are dropped.
  #  NOTE(review): this presumes the most-visited domain is 4chan -- confirm
  #  whenever the underlying data change.
  summary_out = d_web %>%
    filter(time_antisem > 0) %>%
    group_by(domain) %>%
    summarize(N_visits = n(), N_users = n_distinct(caseid)) %>%
    arrange(desc(N_visits)) %>%
    mutate(
      Percent = (N_visits / sum(N_visits)) * 100,
      #  N_visits[-1] (rather than 2:length(N_visits)) stays well-defined
      #  when only one domain is present
      `Excluding 4chan` = c(NA_real_, (N_visits[-1] / sum(N_visits[-1])) * 100)
    ) %>%
    filter(`Excluding 4chan` >= 1 | is.na(`Excluding 4chan`)) %>%
    select(Domain = domain, Percent, `Excluding 4chan`) %>%
    xtable::xtable(digits = 1, caption = "Antisemitic site visits by domain.", label = "tab:domain") %>%
    print(include.rownames = FALSE, booktabs = TRUE, caption.placement = "top") %>%
    gsub("lllr", "lrrr", .)

  write(summary_out, "current_draft/appendix_figures/tab_domain.txt")
}

log_close()